*//////////////////////////////////////////////////////////////////////////////
*                                                                             /         
*				High-Pressure, High-Paying Jobs?                              /
*               The Review of Economics and Statistics                        /
*                                                                             /
*               Authors: Markus Nagler, Johannes Rincke, Erwin Winkler        /
*                                                                             /
*               Do-File 6: Random forest prediction of pressure index         /
*                                                                             /
*               This file generates:                                          /
*               - Figure A.3                                                  /
*                                                                             /
*                                                                             /
*//////////////////////////////////////////////////////////////////////////////


* Load the processed BIBB 2018 data set. The global macro $data is not defined
* in this file and is expected to be set by the calling (master) do-file.
* -clear- is the documented option for discarding the data in memory, and a
* forward slash is accepted as path separator on Windows, macOS and Linux.
use "$data/bibb_2018_processed.dta", clear

* Attach readable labels; the importance plots further below display the
* variable label (when one exists) instead of the raw variable name.
label variable d_codifiable  "Codifiability of job"
label variable d_routine     "Routine-intensity of job"
label variable female        "Gender"
label variable high_edu      "University degree"
label variable med_edu       "Vocational degree"
label variable number_sub    "Number of subordinates"
label variable german        "German nationality"
label variable temp_contract "Temporary contract"


*////////////////////////////////////////
*                                       /
* Construct health index using PCA      /
*                                       /
*////////////////////////////////////////

* Principal component analysis of the ten items listed below. With a single
* new variable name, -predict- stores the score on the first component.
pca nosleep tired nervous exhausted_mental exhausted_phys ///
    d_taxing d_relax toomuchwork stress_increase d_limit
predict health_index

* Keep only observations for which the score could be computed.
drop if missing(health_index)

* Standardize the index to mean 0 and standard deviation 1, then display the
* summary statistics again as a check.
tempname hi_mean hi_sd
summarize health_index
scalar `hi_mean' = r(mean)
scalar `hi_sd'   = r(sd)
replace health_index = (health_index - scalar(`hi_mean')) / scalar(`hi_sd')
summarize health_index



*///////////////////////////////////////////////////////////////////
*                                                                  /
* Randomly split dataset into training and test data (50-50)       /
*                                                                  /
*///////////////////////////////////////////////////////////////////

* Drop any Stata matrices left in memory and fix the random-number seed so
* that the draws below start from a known state.
clear matrix
set seed 141015

* rvar: one uniform draw per observation. Observations at or below the median
* draw are flagged as training data, those above the median as test data.
gen rvar = uniform()
gen training=0
gen test=0
sum rvar, detail
replace training=1 if rvar<=r(p50)
replace test=1 if rvar>r(p50)
* NOTE(review): -sort- is used without the stable option, so the order of
* observations within test==0 and test==1 may not be reproducible across
* runs. The variables u and randomvar are generated in the resulting order -
* confirm that this does not affect replication of the figures.
sort test


* Re-seed with the same value as above, draw a second uniform variable and
* sort on it, which puts the observations in random order.
set seed 141015 
gen u = uniform()
sort u
sum u

* randomvar: a further uniform draw that enters the random forests below as an
* additional predictor (presumably as a pure-noise benchmark for the
* importance scores - confirm with the paper).
gen randomvar = uniform()
label var randomvar "Random variable"
sum randomvar






*** Baseline pressure index

* Work on a temporary copy of the data: the importance scores are written
* into the data set below and discarded again by -restore-.
preserve

* Regression random forest (user-written command -rforest-) of the pressure
* index on the listed covariates, using the observations flagged test==1.
rforest pressure high_edu med_edu age female german works_council temp_agency commute temp_contract firmsize normal_workhours shift standby number_sub d_routine d_codifiable computer phys_index randomvar if test==1, type(reg) iterations(3000) numvars(4) lsize(25) seed(141015) numdec(10)

* Display the variable-importance matrix returned by rforest and keep a copy.
matrix list e(importance)
matrix importance = e(importance)

* The row names of the matrix are the predictor names.
local mynames : rownames importance
local k : word count `mynames'

* Make sure there are at least as many observations as predictors before the
* matrix is written into the data set.
if `k' > _N {
    set obs `k'
}

* -svmat- stores the single matrix column as variable importance1. It is
* referenced by its full name so that the code does not depend on variable
* abbreviation (it would break with -set varabbrev off- or if another variable
* starting with "importance" existed).
svmat importance
list importance1 in 1/5

* id holds the text shown next to each bar: the variable label when there is
* one, otherwise the variable name. Compound quotes protect labels that
* contain double quotes.
gen id = ""
forvalues i = 1/`k' {
    local aword : word `i' of `mynames'
    local alabel : variable label `aword'
    if `"`alabel'"' != "" {
        quietly replace id = `"`alabel'"' in `i'
    }
    else {
        quietly replace id = "`aword'" in `i'
    }
}

* Horizontal bar chart of the importance scores, sorted by size.
graph hbar (mean) importance1, over(id, sort(1)) ytitle(Prediction of pressure index: variable importance) yscale(range(0 1)  titlegap(4))


restore




*** Dropping minimum requirements from index

* Work on a temporary copy of the data: the importance scores are written
* into the data set below and discarded again by -restore-.
preserve

* Regression random forest (user-written command -rforest-) of pressure3 (the
* index without minimum requirements, per the section heading) on the listed
* covariates, using the observations flagged test==1.
rforest pressure3 high_edu med_edu age female german works_council temp_agency commute temp_contract firmsize normal_workhours shift standby number_sub d_routine d_codifiable computer phys_index randomvar if test==1, type(reg) iterations(3000) numvars(4) lsize(25) seed(141015) numdec(10)

* Display the variable-importance matrix returned by rforest and keep a copy.
matrix list e(importance)
matrix importance = e(importance)

* The row names of the matrix are the predictor names.
local mynames : rownames importance
local k : word count `mynames'

* Make sure there are at least as many observations as predictors before the
* matrix is written into the data set.
if `k' > _N {
    set obs `k'
}

* -svmat- stores the single matrix column as variable importance1. It is
* referenced by its full name so that the code does not depend on variable
* abbreviation (it would break with -set varabbrev off- or if another variable
* starting with "importance" existed).
svmat importance
list importance1 in 1/5

* id holds the text shown next to each bar: the variable label when there is
* one, otherwise the variable name. Compound quotes protect labels that
* contain double quotes.
gen id = ""
forvalues i = 1/`k' {
    local aword : word `i' of `mynames'
    local alabel : variable label `aword'
    if `"`alabel'"' != "" {
        quietly replace id = `"`alabel'"' in `i'
    }
    else {
        quietly replace id = "`aword'" in `i'
    }
}

* Horizontal bar chart of the importance scores, sorted by size.
graph hbar (mean) importance1, over(id, sort(1)) ytitle(Prediction of pressure index: variable importance) yscale(range(0 1)  titlegap(4))


restore




*** Baseline pressure index - within occupations and industries

* Work on a temporary copy of the data: the residual and the importance
* scores created below are discarded again by -restore-.
preserve

* Residualize the pressure index from the occupation (d_occ2d_*) and industry
* (d_ind_*) dummies with a weighted regression on all observations.
regress pressure d_occ2d_* d_ind_* [aw=weight]
predict pressure_res, residuals

* Regression random forest (user-written command -rforest-) of the residual
* on the listed covariates, using the observations flagged test==1.
rforest pressure_res high_edu med_edu age female german works_council temp_agency commute temp_contract firmsize normal_workhours shift standby number_sub d_routine d_codifiable computer phys_index randomvar if test==1, type(reg) iterations(3000) numvars(4) lsize(25) seed(141015) numdec(10)

* Display the variable-importance matrix returned by rforest and keep a copy.
matrix list e(importance)
matrix importance = e(importance)

* The row names of the matrix are the predictor names.
local mynames : rownames importance
local k : word count `mynames'

* Make sure there are at least as many observations as predictors before the
* matrix is written into the data set.
if `k' > _N {
    set obs `k'
}

* -svmat- stores the single matrix column as variable importance1. It is
* referenced by its full name so that the code does not depend on variable
* abbreviation (it would break with -set varabbrev off- or if another variable
* starting with "importance" existed).
svmat importance
list importance1 in 1/5

* id holds the text shown next to each bar: the variable label when there is
* one, otherwise the variable name. Compound quotes protect labels that
* contain double quotes.
gen id = ""
forvalues i = 1/`k' {
    local aword : word `i' of `mynames'
    local alabel : variable label `aword'
    if `"`alabel'"' != "" {
        quietly replace id = `"`alabel'"' in `i'
    }
    else {
        quietly replace id = "`aword'" in `i'
    }
}

* Horizontal bar chart of the importance scores, sorted by size.
graph hbar (mean) importance1, over(id, sort(1)) ytitle(Prediction of pressure index: variable importance) yscale(range(0 1)  titlegap(4))


restore






*** Dropping minimum requirements from index - within occupations and industries

* Work on a temporary copy of the data: the residual and the importance
* scores created below are discarded again by -restore-.
preserve

* Residualize pressure3 from the occupation (d_occ2d_*) and industry
* (d_ind_*) dummies with a weighted regression on all observations.
regress pressure3 d_occ2d_* d_ind_* [aw=weight]
predict pressure3_res, residuals

* Regression random forest (user-written command -rforest-) of the residual
* on the listed covariates, using the observations flagged test==1.
rforest pressure3_res high_edu med_edu age female german works_council temp_agency commute temp_contract firmsize normal_workhours shift standby number_sub d_routine d_codifiable computer phys_index randomvar if test==1, type(reg) iterations(3000) numvars(4) lsize(25) seed(141015) numdec(10)

* Display the variable-importance matrix returned by rforest and keep a copy.
matrix list e(importance)
matrix importance = e(importance)

* The row names of the matrix are the predictor names.
local mynames : rownames importance
local k : word count `mynames'

* Make sure there are at least as many observations as predictors before the
* matrix is written into the data set.
if `k' > _N {
    set obs `k'
}

* -svmat- stores the single matrix column as variable importance1. It is
* referenced by its full name so that the code does not depend on variable
* abbreviation (it would break with -set varabbrev off- or if another variable
* starting with "importance" existed).
svmat importance
list importance1 in 1/5

* id holds the text shown next to each bar: the variable label when there is
* one, otherwise the variable name. Compound quotes protect labels that
* contain double quotes.
gen id = ""
forvalues i = 1/`k' {
    local aword : word `i' of `mynames'
    local alabel : variable label `aword'
    if `"`alabel'"' != "" {
        quietly replace id = `"`alabel'"' in `i'
    }
    else {
        quietly replace id = "`aword'" in `i'
    }
}

* Horizontal bar chart of the importance scores, sorted by size; the graph
* window is closed again right after drawing.
graph hbar (mean) importance1, over(id, sort(1)) ytitle(Prediction of pressure index: variable importance) yscale(range(0 1)  titlegap(4))
graph close

restore




















